library(dplyr)
library(ggplot2)
library(lmtest)
library(lubridate)
library(limma)
library(tidyr)
library(plotly)
library(gridExtra)
library(data.table)
library(formattable)
#Data Files and prep work
source("../../../lib/DataProccess.R")
source("../../../lib/NormFuncs.R")
source("../../../lib/OutlierDetectionFuncs.R")
source("../../../lib/DataPathName.R")
BaseDir <- params$BaseDir#get the root of the directory where the data is stored
The steps to this analysis are: Remove outliers Remove trend normalize residual find constant variance
First we look at Madison data
LIMSFullDF <- ParseData(LIMSWastePath(BaseDir))%>%
filter(Site == "Madison")%>%
select(Date, Site, N1, BCoV , N2 , PMMoV,
FlowRate,temperature,equiv_sewage_amt)
ErrorMarkedDF <- LIMSFullDF%>%#
mutate(FlagedOutliers = IdentifyOutliers(N1, Action = "Flag"),
#Manual flagging that method misses due to boundary effect with binning
FlagedOutliers = ifelse(Date == mdy("01/26/2021") |
Date == mdy("01/27/2021") |
Date == mdy("01/23/2022"),
TRUE, FlagedOutliers),
NoOutlierVar = ifelse(FlagedOutliers, NA, N1))
#Split N1 into outlier and non outlier for next ggplot
OutLierPlotDF <- ErrorMarkedDF%>%
mutate(OutlierN1 = ifelse(FlagedOutliers,N1,NA))%>%
mutate(N1 := NoOutlierVar)
OutLierPlotObject <- OutLierPlotDF%>%
filter(!(is.na(N1)&is.na(OutlierN1)))%>%
ggplot(aes(x=Date))+#Data depends on time
geom_line(aes(y=N1,
color="N1",
info = N1))+#compares Var to Cases
geom_point(aes(y=OutlierN1,
color="OutlierN1",
info = OutlierN1))
ggplotly(OutLierPlotObject,tooltip=c("info","Date"))
UpdatedDF <- ErrorMarkedDF%>%
select(-N1)%>%
rename(N1 = NoOutlierVar)
UpdatedDF$loessN1 <- loessFit(y=(UpdatedDF$N1),
x=UpdatedDF$Date, #create loess fit of the data
span=.05, #span of .2 seems to give the best result, not rigorously chosen
iterations=2)$fitted#2 iterations remove some bad patterns
SLDLoessGraphic <- UpdatedDF%>%
NoNa("loessN1")%>%
ggplot(aes(x=Date))+
geom_line(aes(y=N1,
color="N1",
info = N1),
alpha=.2)+
geom_line(aes(y = loessN1,
color="loessN1" ,
info = loessN1))+
geom_hline(yintercept = (5e4))+
geom_hline(yintercept = (1e6))
ggplotly(SLDLoessGraphic,tooltip=c("info","Date"))
DetrendedDF <- UpdatedDF%>%
mutate(DetrendedN1 = N1 - loessN1)
DetrendLoessGraphic <- DetrendedDF%>%
NoNa("DetrendedN1")%>%
ggplot(aes(x=Date))+
geom_line(aes(y=DetrendedN1,
color="DetrendedN1",
info = DetrendedN1))
ggplotly(DetrendLoessGraphic,tooltip=c("info","Date"))
mean(DetrendedDF$DetrendedN1, na.rm=TRUE)
## [1] 12278.31
library(tseries)
NormRestruct <- function(Norm){
return(mean(Norm,na.rm=TRUE)/Norm)
}
TestNorm <- function(DF,Base,Norm){
noNADF <- DF%>%
NoNa(Norm)%>%
mutate(SevDay = rollapply(!!sym(Norm),7,mean, partial = TRUE),
SevDayBase = rollapply(!!sym(Base),7,mean, partial = TRUE))
ResidDetrendLoessGraphic <- noNADF%>%
ggplot(aes(x=Date))+
geom_point(aes(y=!!sym(Base),
color=Base),
size = .5)+
geom_point(aes(y=!!sym(Norm),
color=Norm),
size = .5)+
geom_line(aes(y=SevDayBase,
color="SevDayBase"))+
geom_line(aes(y=SevDay,
color="SevDay"))
print(adf.test(noNADF[[Norm]]))
ggplotly(ResidDetrendLoessGraphic,tooltip=c("y","Date"))
}
ResidDetrendedDF <- DetrendedDF%>%
mutate(N1NormedResid = DetrendedN1*NormRestruct(loessN1),
sqrtNormedResid = DetrendedN1*NormRestruct(sqrt(loessN1)),
logNormedResid = DetrendedN1*NormRestruct(log(loessN1)),
xlogNormedResid = DetrendedN1*NormRestruct(loessN1*log(loessN1)),
WeirdNormedResid = DetrendedN1*NormRestruct(sqrt(loessN1)*log(loessN1)))
TestNorm(ResidDetrendedDF,"DetrendedN1","DetrendedN1")
##
## Augmented Dickey-Fuller Test
##
## data: noNADF[[Norm]]
## Dickey-Fuller = -9.6536, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
TestNorm(ResidDetrendedDF,"DetrendedN1","N1NormedResid")
##
## Augmented Dickey-Fuller Test
##
## data: noNADF[[Norm]]
## Dickey-Fuller = -9.372, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
TestNorm(ResidDetrendedDF,"DetrendedN1","sqrtNormedResid")
##
## Augmented Dickey-Fuller Test
##
## data: noNADF[[Norm]]
## Dickey-Fuller = -9.795, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
ResidDetrendedDF <- DetrendedDF%>%
mutate(VarCompN1 = DetrendedN1**2,
N1NormedResid = VarCompN1*NormRestruct(loessN1^2),
sqrtNormedResid = VarCompN1*NormRestruct(loessN1),
logNormedResid = VarCompN1*NormRestruct(log(loessN1)^2),
xlogNormedResid = VarCompN1*NormRestruct(loessN1^2*log(loessN1)^2),
WeirdNormedResid = VarCompN1*NormRestruct(loessN1*log(loessN1)^2))
TestNorm(ResidDetrendedDF,"VarCompN1","VarCompN1")
##
## Augmented Dickey-Fuller Test
##
## data: noNADF[[Norm]]
## Dickey-Fuller = -4.2406, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
TestNorm(ResidDetrendedDF,"VarCompN1","N1NormedResid")
##
## Augmented Dickey-Fuller Test
##
## data: noNADF[[Norm]]
## Dickey-Fuller = -5.4619, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
TestNorm(ResidDetrendedDF,"VarCompN1","sqrtNormedResid")
##
## Augmented Dickey-Fuller Test
##
## data: noNADF[[Norm]]
## Dickey-Fuller = -4.5547, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
ViewTwo: Shape of the Var series
ShapeDF <- ResidDetrendedDF%>%
filter(!is.na(log(VarCompN1)),!is.na(log(N1)))
ShapeDF$GamVar <- mgcv::gam(
formula = log(VarCompN1) ~ s(log(N1), bs = "cs"), data = ShapeDF)$fitted
ShapeDF%>%
NoNa("VarCompN1")%>%
ggplot(aes(x = log(N1)))+
geom_point(aes(y=log(VarCompN1),
color="VarCompN1"),
size = .5)+
geom_line(aes(y=GamVar,
color="GamVar"))+
geom_abline(slope = 1.62603,intercept = 2.11644,color="Blue")+
geom_vline(xintercept = log(5e4))+
geom_vline(xintercept = log(1e6))+
facet_wrap(~Site)
AllSiteDF <- ParseData(LIMSWastePath(BaseDir))
SiteDFList <- split(AllSiteDF,AllSiteDF$Site)
loessFitandDetrend <- function(DF){
spanC = min(.05*393/length(DF$N1),.3)
DF$loessN1 <- loessFit(y=DF$N1,
x=DF$Date, #create loess fit of the data
span=spanC,
iterations=2)$fitted#2 iterations remove some bad patterns
RetDF <- DF%>%
mutate(DetrendedN1 = N1 - loessN1,
VarN1 = DetrendedN1^2,
SevDay = rollapply(VarN1,7,mean, partial = TRUE))
return(RetDF)
}
TempListDF <- lapply(SiteDFList,loessFitandDetrend)
FullDetrendedData <- rbindlist(TempListDF)%>%
NoNa("N1","VarN1")%>%
filter(VarN1>1)
FullDetrendedData$loessVar <- loessFit(y=log(FullDetrendedData$VarN1),
x=log(FullDetrendedData$N1), #create loess fit of the data
span=.3,
iterations=2,
method = "loess")$fitted#2 iterations remove some bad patterns
FullDetrendedData$GamVar <- mgcv::gam(
formula = log(VarN1) ~ s(log(N1), bs = "cs"),data = FullDetrendedData)$fitted
FullDetrendedData%>%
NoNa("VarN1")%>%
ggplot(aes(x = log(N1)))+
geom_point(aes(y=log(VarN1),
color="VarN1"),
size = 2,alpha=.1)+
geom_abline(slope = 1.62603,intercept = 2.11644,color="Blue")+
geom_vline(xintercept = log(5e4))+
geom_vline(xintercept = log(1e6))+
geom_line(aes(y=GamVar))
RemoveLowDF <- FullDetrendedData%>%
filter(N1>5e4,N1<1e6)
RemoveLowDF%>%
ggplot(aes(x=log(N1),y=GamVar))+
geom_point()+
geom_abline(slope = 1.635322,intercept = 2.009892)
#geom_abline(slope = 1.842089,intercept = -0.534551)
#1.862581, -0.523034
summary(lm(GamVar~log(N1),data=RemoveLowDF))
##
## Call:
## lm(formula = GamVar ~ log(N1), data = RemoveLowDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.34044 -0.15823 0.04028 0.13855 0.46399
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.009892 0.046776 42.97 <2e-16 ***
## log(N1) 1.635322 0.003819 428.16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1935 on 3845 degrees of freedom
## Multiple R-squared: 0.9795, Adjusted R-squared: 0.9795
## F-statistic: 1.833e+05 on 1 and 3845 DF, p-value: < 2.2e-16
summary(lm(log(VarN1)~log(N1),data=RemoveLowDF))
##
## Call:
## lm(formula = log(VarN1) ~ log(N1), data = RemoveLowDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.3053 -1.1762 0.3949 1.5645 8.5448
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.11644 0.61003 3.469 0.000527 ***
## log(N1) 1.62603 0.04981 32.644 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.524 on 3845 degrees of freedom
## Multiple R-squared: 0.217, Adjusted R-squared: 0.2168
## F-statistic: 1066 on 1 and 3845 DF, p-value: < 2.2e-16
RemoveLowDF%>%
ggplot(aes(x=log(N1),y=log(VarN1) - 1.842089*log(N1)+0.534551))+
geom_point()+
geom_abline(slope = 1.842089,intercept = -0.534551)
FullDetrendedData%>%
#filter(log(N1)>11)%>%
NoNa("VarN1")%>%
ggplot(aes(x = N1))+
scale_x_log10()+
geom_point(aes(y=GamVar))
FullDetrendedData%>%
#filter(log(N1)>11)%>%
ggplot(aes(x = N1))+
scale_x_log10()+
geom_point(aes(y=GamVar-lag(GamVar)),alpha=.1,size=.5)